Preanalysis

Set project directory

# directory <- "X:/mchung/christina_yek_02/pipeline_output/"
directory <- "X:/mchung/sasha_mushegian_12/pipeline_output/"

print(directory)
## [1] "X:/mchung/sasha_mushegian_12/pipeline_output/"

Load packages

library(bayestestR)
library(ComplexHeatmap)
library(DT)
library(ggplot2)
library(matrixStats)
library(randomcoloR)
library(reshape2)
library(plotly)

sessionInfo()
## R version 4.5.0 (2025-04-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] plotly_4.10.4         reshape2_1.4.4        randomcoloR_1.1.0.1  
## [4] matrixStats_1.5.0     ggplot2_3.5.2         DT_0.33              
## [7] ComplexHeatmap_2.24.0 bayestestR_0.16.0    
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6        circlize_0.4.16     shape_1.4.6.1      
##  [4] rjson_0.2.23        xfun_0.52           bslib_0.9.0        
##  [7] htmlwidgets_1.6.4   GlobalOptions_0.1.2 insight_1.3.0      
## [10] vctrs_0.6.5         tools_4.5.0         generics_0.1.4     
## [13] stats4_4.5.0        curl_6.2.2          parallel_4.5.0     
## [16] tibble_3.2.1        cluster_2.1.8.1     pkgconfig_2.0.3    
## [19] data.table_1.17.2   RColorBrewer_1.1-3  S4Vectors_0.46.0   
## [22] lifecycle_1.0.4     compiler_4.5.0      farver_2.1.2       
## [25] stringr_1.5.1       codetools_0.2-20    clue_0.3-66        
## [28] htmltools_0.5.8.1   sass_0.4.10         yaml_2.3.10        
## [31] lazyeval_0.2.2      tidyr_1.3.1         pillar_1.10.2      
## [34] crayon_1.5.3        jquerylib_0.1.4     cachem_1.1.0       
## [37] iterators_1.0.14    foreach_1.5.2       tidyselect_1.2.1   
## [40] digest_0.6.37       Rtsne_0.17          stringi_1.8.7      
## [43] dplyr_1.1.4         purrr_1.0.4         fastmap_1.2.0      
## [46] colorspace_2.1-1    cli_3.6.5           magrittr_2.0.3     
## [49] withr_3.0.2         scales_1.4.0        rmarkdown_2.29     
## [52] httr_1.4.7          png_0.1-8           GetoptLong_1.0.5   
## [55] evaluate_1.0.3      knitr_1.50          IRanges_2.42.0     
## [58] V8_6.0.3            doParallel_1.0.17   viridisLite_0.4.2  
## [61] rlang_1.1.6         Rcpp_1.0.14         glue_1.8.0         
## [64] BiocGenerics_0.54.0 rstudioapi_0.17.1   jsonlite_2.0.0     
## [67] R6_2.6.1            plyr_1.8.9

Load taxa counts

bad_taxa <- c("Homo sapiens","synthetic construct")

counts <- list(taxa = read.delim(paste0(directory,"/tables/counts.taxa.bracken.tsv"),
                      check.names = F))
counts$taxa <- counts$taxa[!(rownames(counts$taxa) %in% bad_taxa),]

Load functional counts

counts$module <- read.delim(paste0(directory,"/tables/counts.ko.module.fmap.tsv"),
                            quote = "",check.names = F)
counts$ortho <- read.delim(paste0(directory,"/tables/counts.ko.ortho.fmap.tsv"),
                           quote = "",check.names = F)
counts$pathway <- read.delim(paste0(directory,"/tables/counts.ko.pathway.fmap.tsv"),
                             quote = "",check.names = F)

Subset max 300 samples for viewing

set.seed(123456)
if(ncol(counts$taxa) >= 300){
  subset <- sort(sample(colnames(counts$taxa),300))
}else{
  subset <- colnames(counts$taxa)
}

Taxonomic Overview

Sample Counts

df <- as.data.frame(cbind(colnames(counts$taxa),
                          colSums(counts$taxa)))
names(df) <- c("sample","counts")
df$counts <- as.numeric(as.character(df$counts))

datatable(df,rownames = F)

Sample Distribution

Bracken Taxa

fig <- plot_ly(x=colnames(counts$taxa[subset]),y=colSums(counts$taxa[subset]),color=I("blue"),
               name="Taxa",
               type="bar")
fig <- fig %>% layout(title = "Bracken Taxa Counts",
                      yaxis = list(title = "counts"))
fig
df <- df[df[,1] %in% subset,]

p <- ggplot()+
  geom_density(aes(x=df$counts),fill="blue")+
  geom_rug(aes(x=df$counts,label=df$sample),color="blue")+
  stat_boxplot(geom = "vline", aes(xintercept = 0))+
  labs(x="counts")+
  theme_bw()

fig <- ggplotly(p)
fig

Taxonomic Profiling

relab <- as.matrix(apply(counts$taxa[subset],2,function(x){x/sum(x)*100}))
top_taxa <- rownames(relab[order(-rowMedians(relab)),])[1:50]

set.seed(446)
col <- distinctColorPalette(length(top_taxa))

relab.top_taxa <- as.data.frame(relab[top_taxa,])
relab.top_taxa["Other",] <- 100-colSums(relab.top_taxa)

df <- reshape2::melt(as.matrix(relab.top_taxa))
df[,1] <- factor(df[,1],levels=c(top_taxa,"Other"))
colnames(df) <- c("taxa","sample","relab")

p <- ggplot(df,aes(x=sample,y=relab,fill=taxa))+
  geom_bar(stat="identity")+
  scale_fill_manual(values=c(rev(col),"grey"))+
  scale_y_continuous(expand=c(0,0))+
  labs(y="relative abundance",fill="Taxa")+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

fig <- ggplotly(p,width=1200,height=800)
fig

Taxonomic Quantification

kuniq <- list(cov = read.delim(paste0(directory,"/tables/cov.taxa.krakenuniq.tsv"),
                                check.names = F),
              dup = read.delim(paste0(directory,"/tables/dup.taxa.krakenuniq.tsv"),
                                check.names = F),
              kmers = read.delim(paste0(directory,"/tables/kmers.taxa.krakenuniq.tsv"),
                                  check.names = F))
kuniq <- lapply(kuniq,function(x){return(x[colnames(counts$taxa)])})

Bracken Taxa

df <- reshape2::melt(as.matrix(relab[top_taxa,subset]))
df[,1] <- factor(df[,1],levels=rev(top_taxa))
colnames(df) <- c("taxa","sample","relab")

p <- ggplot(df,aes(y=relab,x=taxa,fill=taxa))+
  geom_boxplot()+
  scale_fill_manual(values=col)+
  labs(y="relative abundance")+
  theme_bw()+
  theme(axis.title.y = element_blank(),
        legend.position='none')+
  coord_flip()

fig <- ggplotly(p,width=800,height=800)
fig

KrakenUniq Statistics

KrakenUniq Coverage

df <- reshape2::melt(as.matrix(kuniq$cov[top_taxa,subset]))
df[,1] <- factor(df[,1],levels=rev(top_taxa))
colnames(df) <- c("taxa","sample","cov")

p <- ggplot(df,aes(y=cov,x=taxa,fill=taxa))+
  geom_boxplot()+
  scale_fill_manual(values=col)+
  labs(y="relative abundance")+
  theme_bw()+
  theme(axis.title.y = element_blank(),
        legend.position='none')+
  coord_flip()

fig <- ggplotly(p,width=800,height=800)
fig

KrakenUniq Duplicates

df <- reshape2::melt(as.matrix(kuniq$dup[top_taxa,subset]))
df[,1] <- factor(df[,1],levels=rev(top_taxa))
colnames(df) <- c("taxa","sample","dup")

p <- ggplot(df,aes(y=dup,x=taxa,fill=taxa))+
  geom_boxplot()+
  scale_fill_manual(values=col)+
  labs(y="relative abundance")+
  theme_bw()+
  theme(axis.title.y = element_blank(),
        legend.position='none')+
  coord_flip()

fig <- ggplotly(p,width=800,height=800)
fig

KrakenUniq Kmers

df <- reshape2::melt(as.matrix(kuniq$kmers[top_taxa,subset]))
df[,1] <- factor(df[,1],levels=rev(top_taxa))
colnames(df) <- c("taxa","sample","kmers")

p <- ggplot(df,aes(y=kmers,x=taxa,fill=taxa))+
  geom_boxplot()+
  scale_fill_manual(values=col)+
  labs(y="relative abundance")+
  theme_bw()+
  theme(axis.title.y = element_blank(),
        legend.position='none')+
  coord_flip()

fig <- ggplotly(p,width=800,height=800)
fig

Functional Profiling

Sample Distribution

FMAP Pathway KOs

fig <- plot_ly(x=colnames(counts$pathway[subset]),y=colSums(counts$pathway[subset]),color=I("red"),
               name="Pathway KOs",
               type="bar")
fig <- fig %>% layout(title = "FMAP Pathway KO Counts",
                      yaxis = list(title = "counts"))
fig
df <- as.data.frame(cbind(colnames(counts$pathway[subset]),
                          colSums(counts$pathway[subset])))
names(df) <- c("sample","counts")
df$counts <- as.numeric(as.character(df$counts))

p <- ggplot()+
  geom_density(aes(x=df$counts),fill="red")+
  geom_rug(aes(x=df$counts,label=df$sample),color="red")+
  stat_boxplot(geom = "vline", aes(xintercept = 0))+
  labs(x="counts")+
  theme_bw()

fig <- ggplotly(p)
fig

FMAP Ortho KOs

fig <- plot_ly(x=colnames(counts$ortho[subset]),y=colSums(counts$ortho[subset]),color=I("green"),
               name="Ortho KOs",
               type="bar")
fig <- fig %>% layout(title = "FMAP Ortho KO Counts",
                      yaxis = list(title = "counts"))
fig
df <- as.data.frame(cbind(colnames(counts$ortho[subset]),
                          colSums(counts$ortho[subset])))
names(df) <- c("sample","counts")
df$counts <- as.numeric(as.character(df$counts))

p <- ggplot()+
  geom_density(aes(x=df$counts),fill="green")+
  geom_rug(aes(x=df$counts,label=df$sample),color="green")+
  stat_boxplot(geom = "vline", aes(xintercept = 0))+
  labs(x="counts")+
  theme_bw()

fig <- ggplotly(p)
fig

FMAP Module KOs

fig <- plot_ly(x=colnames(counts$module[subset]),y=colSums(counts$module[subset]),color=I("orange"),
               name="Module KOs",
               type="bar")
fig <- fig %>% layout(title = "FMAP Module KO Counts",
                      yaxis = list(title = "counts"))
fig
df <- as.data.frame(cbind(colnames(counts$module[subset]),
                          colSums(counts$module[subset])))
names(df) <- c("sample","counts")
df$counts <- as.numeric(as.character(df$counts))

p <- ggplot()+
  geom_density(aes(x=df$counts),fill="orange")+
  geom_rug(aes(x=df$counts,label=df$sample),color="orange")+
  stat_boxplot(geom = "vline", aes(xintercept = 0))+
  labs(x="counts")+
  theme_bw()

fig <- ggplotly(p)
fig

Functional Quantification

FMAP Pathway KOs

df <- counts$pathway[subset]

relab <- apply(df,2,function(x){x/sum(x)*100})
top_kos <- rownames(relab)[order(-rowMedians(relab))][1:50]

df <- reshape2::melt(as.matrix(df[top_kos,]))
df[,1] <- factor(df[,1],levels=rev(top_kos))
colnames(df) <- c("kos","sample","kmers")

p <- ggplot(df,aes(y=kmers,x=kos))+
  geom_boxplot(fill="red")+
  labs(y="relative abundance")+
  theme_bw()+
  theme(axis.title.y = element_blank(),
        legend.position='none')+
  coord_flip()

fig <- ggplotly(p,width=800,height=800)
fig

FMAP Ortho KOs

df <- counts$ortho[subset]

relab <- apply(df,2,function(x){x/sum(x)*100})
top_kos <- rownames(relab)[order(-rowMedians(relab))][1:50]

df <- reshape2::melt(as.matrix(df[top_kos,]))
df[,1] <- factor(df[,1],levels=rev(top_kos))
colnames(df) <- c("kos","sample","kmers")

p <- ggplot(df,aes(y=kmers,x=kos))+
  geom_boxplot(fill="green")+
  labs(y="relative abundance")+
  theme_bw()+
  theme(axis.title.y = element_blank(),
        legend.position='none')+
  coord_flip()

fig <- ggplotly(p,width=800,height=800)
fig

FMAP Module KOs

df <- counts$module[subset]

relab <- apply(df,2,function(x){x/sum(x)*100})
top_kos <- rownames(relab)[order(-rowMedians(relab))][1:50]

df <- reshape2::melt(as.matrix(df[top_kos,]))
df[,1] <- factor(df[,1],levels=rev(top_kos))
colnames(df) <- c("kos","sample","kmers")

p <- ggplot(df,aes(y=kmers,x=kos))+
  geom_boxplot(fill="orange")+
  labs(y="relative abundance")+
  theme_bw()+
  theme(axis.title.y = element_blank(),
        legend.position='none')+
  coord_flip()

fig <- ggplotly(p,width=800,height=800)
fig